More data and methods for metric learning

# Load packages used throughout the analysis (plotting, density
# estimation, graph utilities, and manifold-learning dependencies).
library(tidyverse)
library(dimRed)
library(reticulate)
library(here)
library(viridis)
library(hdrcde)
library(igraph)
library(matrixcalc)
library(akima)
library(car)
library(ggforce)
library(ks)
library(patchwork)
# Source all project helpers (metricML, vkde2d, plot_contour,
# plot_ellipse, plot_outlier, ...) from R/sources.
Jmisc::sourceAll(here::here("R/sources"))
# Fix the RNG seed so stochastic steps (t-SNE, UMAP, sample()) are reproducible.
set.seed(123)

Metric learning

Setup

# Smart-meter demand training data (1 id x 336 time-of-week curves).
train <- readRDS(here::here("data/spdemand_1id336tow_train.rds"))
# train <- readRDS(here::here("data/spdemand_3639id_notow_201length.rds"))
# Parameters fixed
x <- train
N <- nrow(train)       # number of observations
s <- 2                 # embedding (target) dimension
k <- 20                # number of nearest neighbours for the ANN graph
method <- "annIsomap"  # manifold-learning method (approximate-NN Isomap)
annmethod <- "kdtree"  # approximate nearest-neighbour search structure
distance <- "euclidean"
treetype <- "kd"
searchtype <- "radius" # change searchtype for radius search based on `radius`, or KNN search based on `k`
radius <- .4 # the bandwidth parameter, \sqrt{\epsilon}, as in algorithm

The metric learning algorithm

# Run the metric-learning pipeline: Isomap embedding plus the estimated
# Riemannian metric at each embedded point.
# NOTE(review): invert.h = TRUE presumably returns the inverse of the
# dual metric H (i.e. the metric itself) -- confirm in metricML().
metric_isomap <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, invert.h = TRUE)
## Warning in matchPars(methodObject, list(...)): Parameter matching: perplexity is
## not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: theta is not
## a standard parameter, ignoring.
summary(metric_isomap)
##                Length Class     Mode   
## embedding         672 -none-    numeric
## rmetric          1344 -none-    numeric
## weighted_graph     10 igraph    list   
## adj_matrix     112896 dgCMatrix S4     
## laplacian      112896 dgCMatrix S4

Variable kernel density estimate

For multivariate data, the variable kernel density estimate is given by \[\hat{f}(x) = n^{-1} \sum_i K_{H_i} (x - X_i).\]

Outlier plot based on density estimates

Fixed bandwidth

# fixed bandwidth
# HDR scatterplot of the Isomap embedding with the estimated Riemannian
# metric overlaid as ellipses at a subset (n.plot) of points.
fn <- metric_isomap$embedding
E1 <- fn[,1] # rename as Ed to match the aesthetics in plot_ellipse()
E2 <- fn[,2]
prob <- c(1, 50, 99) # HDR coverage levels (%)
p_hdr_isomap <- hdrscatterplot(E1, E2, levels = prob, noutliers = 20, label = NULL)
p_hdr_isomap_p <- p_hdr_isomap +
  plot_ellipse(metric_isomap, add = TRUE, n.plot = 50, scale = 20,
               color = blues9[5], fill = blues9[5], alpha = 0.2)
# p_hdr_isomap_p

Pointwise adaptive bandwidth

# Pointwise adaptive bandwidth: use the Riemannian metric R_i estimated
# at each embedded point as that observation's bandwidth matrix H_i.
Rn <- metric_isomap$rmetric # array
n.grid <- 10
# Variable kernel density estimate on an n.grid x n.grid grid.
f <- vkde2d(x = fn[,1], y = fn[,2], h = Rn, n = n.grid)
plot_contour(metric_isomap, n.grid = n.grid, scale = 1/20)

# Re-source to pick up the latest version of the outlier-plot helpers.
source(here::here("R/sources/hdrplotting.R"))
p_isomap <- plot_outlier(x = metric_isomap, n.grid = 50, prob = prob, scale = 1/8)
# Fixed-bandwidth (left) vs variable-bandwidth (right) side by side.
(p_hdr_isomap_p + p_isomap$p ) + coord_fixed()

t-SNE

# Repeat the metric-learning pipeline with a t-SNE embedding.
x <- train
method <- "anntSNE"
perplexity <- round(k / 3) # 30 by default
theta <- 0 # for exact tSNE in the C++ tSNE Barnes-Hut implementation
# tictoc::tic()
metric_tsne <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, perplexity = perplexity, theta = theta, invert.h = TRUE)
# summary(metric_tsne)
# tictoc::toc()
ftsne <- metric_tsne$embedding
E1 <- ftsne[,1]; E2 <- ftsne[,2]
# plot_embedding(metric_tsne)
# plot_ellipse(metric_tsne, n.plot = 50)
plot_contour(metric_tsne, n.grid = 50, scale = 1/20)

UMAP

# Repeat the metric-learning pipeline with a UMAP embedding.
x <- train
method <- "annUMAP"
# perplexity <- round(k / 3) # 30 by default
theta <- 0
# NOTE(review): perplexity and theta are t-SNE parameters (the comment
# above was copied from the t-SNE chunk); they are presumably ignored by
# the UMAP method, as the matchPars() warnings earlier suggest -- confirm.
# tictoc::tic()
metric_umap <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, perplexity = perplexity, theta = theta, invert.h = TRUE)
# summary(metric_umap)
# tictoc::toc()
fumap <- metric_umap$embedding
E1 <- fumap[,1]; E2 <- fumap[,2]
# plot_embedding(metric_umap)
# plot_ellipse(metric_umap, n.plot = 50)
plot_contour(metric_umap, n.grid = 50, scale = 1/20)

Compare outliers from Isomap and t-SNE

To compare two manifold learning methods, we calculate the overlap among outliers. To do so, we first rank the outliers from highest to lowest density. Then we calculate the correlation of the ranks from both methods.

To show that our method performs better, we expect a higher correlation of ranks. This demonstrates robustness in finding outliers across different manifold learning methods.

(p_isomap$p + p_tsne$p + p_umap$p) + coord_fixed()

Outlier ranking comparison using variable bandwidth

The ranking of outliers can be compared as follows.

# Full outlier rankings (all N observations) under each embedding,
# using the variable-bandwidth density estimates.
p_isomap_all <- plot_outlier(x = metric_isomap, n.grid = 50, prob = prob, noutliers = N, scale = 1/20)
p_tsne_all <- plot_outlier(x = metric_tsne, n.grid = 50, prob = prob, noutliers = N, scale = 1/20)
p_umap_all <- plot_outlier(x = metric_umap, n.grid = 50, prob = prob, noutliers = N, scale = 1/20)


# Top 20 outliers (most to least outlying) per method.
head(p_isomap_all$outlier, n = 20)
##  [1] "135" "323" "228" "17"  "35"  "274" "324" "326" "39"  "328" "182" "38" 
## [13] "183" "327" "218" "134" "185" "231" "237" "279"
head(p_tsne_all$outlier, n = 20)
##  [1] "299" "296" "300" "297" "298" "200" "199" "157" "158" "118" "117" "98" 
## [13] "116" "99"  "188" "189" "186" "187" "100" "248"
head(p_umap_all$outlier, n = 20)
##  [1] "17"  "16"  "257" "274" "208" "244" "113" "3"   "195" "51"  "245" "65" 
## [13] "148" "131" "256" "291" "306" "112" "99"  "209"
# Since all N ids appear in $outlier, order() of the numeric labels gives
# each observation's outlier rank, so the rank vectors can be correlated.
outlier_isomap <- order(as.numeric(p_isomap_all$outlier))
outlier_tsne <- order(as.numeric(p_tsne_all$outlier))
# outlier_umap <- order(as.numeric(p_umap_all$outlier))

# outliers <- cbind(outlier_isomap, outlier_tsne)
cor(outlier_isomap, outlier_tsne)
## [1] -0.06160876
# ggcorrplot::ggcorrplot(r, hc.order = TRUE, type = "lower", lab = TRUE)
topconfects::rank_rank_plot(outlier_isomap, outlier_tsne, n = 50)

# Baseline check: rank_rank_plot on two unrelated random rankings.
library(topconfects)
a <- sample(letters)
b <- sample(letters)
rank_rank_plot(a, b, n = 20)

Outlier ranking comparison using fixed bandwidth

# Outlier ranking agreement using the fixed-bandwidth estimates.
# NOTE(review): p_hdr_tsne is not defined in the visible code -- it is
# presumably the fixed-bandwidth HDR result for the t-SNE embedding,
# created in a chunk not shown here; verify before running.
head(p_isomap$outlier, n = 20)
##  [1] "135" "326" "38"  "182" "39"  "134" "324" "37"  "328" "327" "276" "183"
## [13] "181" "228" "278" "237" "133" "323" "86"  "231"
head(p_hdr_tsne$outlier, n = 20)
##  [1] "157" "296" "299" "158" "300" "297" "252" "237" "303" "298" "118" "130"
## [13] "246" "117" "98"  "302" "322" "253" "179" "254"
hdroutlier_isomap <- order(as.numeric(p_isomap$outlier))
hdroutlier_tsne <- order(as.numeric(p_hdr_tsne$outlier))
# hdroutliers <- cbind(hdroutlier_isomap, hdroutlier_tsne)
cor(hdroutlier_isomap, hdroutlier_tsne)
## [1] 0.1925199
# NOTE(review): head(50) takes the ranks of observations with the 50
# smallest ids, not the 50 strongest outliers -- confirm this is intended.
cor(hdroutlier_isomap %>% head(50), hdroutlier_tsne %>% head(50))
## [1] 0.2977834
topconfects::rank_rank_plot(hdroutlier_isomap, hdroutlier_tsne, n=50)

Compare high density regions

Similarly, we compare the highest density regions from both methods and expect a similar output in detecting most typical observations.

Variable bandwidth

# Least outlying (most typical) 20 observations per method, variable bandwidth.
tail(p_isomap_all$outlier, n = 20)
##  [1] "166" "78"  "66"  "25"  "262" "176" "223" "125" "23"  "20"  "311" "261"
## [13] "121" "77"  "222" "67"  "175" "167" "260" "163"
tail(p_tsne_all$outlier, n = 20)
##  [1] "277" "138" "88"  "37"  "140" "42"  "84"  "326" "43"  "137" "276" "36" 
## [13] "86"  "41"  "89"  "85"  "44"  "181" "90"  "139"
# NOTE(review): tail(50) takes the ranks of observations with the 50
# largest ids, not the 50 most typical observations -- confirm intended.
cor(outlier_isomap %>% tail(50), outlier_tsne %>% tail(50))
## [1] -0.0488914

Fixed bandwidth

# Least outlying (most typical) 20 observations per method, fixed bandwidth.
# NOTE(review): depends on p_hdr_tsne, which is not defined in the
# visible code -- see the fixed-bandwidth ranking section above.
tail(p_isomap$outlier, n = 20)
##  [1] "303" "167" "163" "78"  "260" "118" "166" "67"  "254" "164" "31"  "261"
## [13] "158" "175" "223" "32"  "222" "20"  "176" "23"
tail(p_hdr_tsne$outlier, n = 20)
##  [1] "263" "317" "126" "310" "262" "261" "80"  "89"  "319" "139" "312" "168"
## [13] "318" "24"  "268" "269" "30"  "81"  "90"  "211"
cor(hdroutlier_isomap %>% tail(50), hdroutlier_tsne %>% tail(50))
## [1] 0.3175251